home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
SGI Freeware 2002 November
/
SGI Freeware 2002 November - Disc 2.iso
/
dist
/
fw_glimpse.idb
/
usr
/
freeware
/
src
/
glimpse-3.0
/
index
/
glimpse.c.z
/
glimpse.c
Wrap
C/C++ Source or Header
|
1997-09-09
|
30KB
|
921 lines
/* Copyright (c) 1994 Sun Wu, Udi Manber, Burra Gopal. All Rights Reserved. */
/* ./glimpse/index/glimpse.c */
#include "glimpse.h"
#include <stdlib.h>
#include <sys/time.h>
#if ISO_CHAR_SET
#include <locale.h> /* support for 8bit character set:ew@senate.be */
#endif
extern char **environ;
extern int errno;
#if BG_DEBUG
extern FILE *LOGFILE; /* file descriptor for LOG output */
#endif /*BG_DEBUG*/
extern FILE *STATFILE; /* file descriptor for statistical data about indexed files */
extern FILE *MESSAGEFILE; /* file descriptor for important messages meant for the user */
extern char INDEX_DIR[MAX_LINE_LEN];
extern struct stat istbuf;
extern int indexable_char[256];
extern int GenerateHash;
extern int KeepFilenames;
extern int OneFilePerBlock;
extern int IndexNumber;
extern int CountWords;
extern int StructuredIndex;
extern int MAXWORDSPERFILE;
extern int NUMERICWORDPERCENT;
extern int AddToIndex;
extern int DeleteFromIndex;
extern int PurgeIndex;
extern int FastIndex;
extern int BuildDictionary;
extern int BuildDictionaryExisting;
extern int CompressAfterBuild;
extern int IncludeHigherPriority;
extern int FilenamesOnStdin;
extern int UseFilters;
extern int ByteLevelIndex;
/* extern int IndexUnderscore; */
extern int IndexableFile;
extern int MAX_PER_MB, MAX_INDEX_PERCENT;
extern int I_THRESHOLD;
extern int BigHashTable;
extern int IndexEverything;
extern int BuildTurbo;
extern int AddedMaxWordsMessage;
extern int AddedMixedWordsMessage;
extern int file_num;
extern int old_file_num;
extern int new_file_num;
extern int file_id;
extern int part_num;
extern char **name_list[MAXNUM_INDIRECT];
extern int p_table[MAX_PARTITION];
extern int *size_list[MAXNUM_INDIRECT];
extern int p_size_list[];
extern unsigned int *disable_list;
extern int memory_usage;
extern int mask_int[];
extern int REAL_PARTITION, REAL_INDEX_BUF, MAX_ALL_INDEX, FILEMASK_SIZE;
extern struct indices *deletedlist;
extern char sync_path[MAX_LINE_LEN];
extern set_usemalloc(); /* compress/misc.c */
char IProgname[MAX_LINE_LEN];
/*
* Has newnum crossed the boundary of an encoding? This is so rare that we
* needn't optimize it by changing the format of the old index and reusing it.
*/
cross_boundary(oldnum, newnum)
int oldnum, newnum;
{
int ret;
if (oldnum <= 0) return 0;
ret = ( ((oldnum <= MaxNum8bPartition) && (newnum > MaxNum8bPartition)) ||
((oldnum <= MaxNum12bPartition) && (newnum > MaxNum12bPartition)) ||
((oldnum <= MaxNum16bPartition) && (newnum > MaxNum16bPartition)) );
if (ret) fprintf(MESSAGEFILE, "Must change index format. Commencing fresh indexing...\n");
return ret;
}
determine_sync()
{
char S[1024], s1[256], s2[256];
FILE *fp;
int i, ret;
strcpy(sync_path, "sync");
sprintf(S, "exec whereis sync > /tmp/zz.%d", getpid());
system(S);
sprintf(S, "/tmp/zz.%d", getpid());
if ((fp = fopen(S, "r")) == NULL) {
/* printf("11111\n"); */
return 0;
}
if ((ret = fread(S, 1, 1024, fp)) <= 0) {
sprintf(S, "/tmp/zz.%d", getpid());
unlink(S);
fclose(fp);
/* printf("22222\n"); */
return 0;
}
sprintf(s1, "/tmp/zz.%d", getpid());
unlink(s1);
fclose(fp);
/* printf("read: %s\n", S); */
sscanf(S, "%s%s", s1, s2);
/* printf("s1=%s s2=%s\n", s1, s2); */
if (strncmp(s1, "sync", 4)) {
/* printf("33333\n"); */
return 0;
}
if (!strcmp(s2, "") || !strcmp(s2, " ")) {
/* printf("44444\n"); */
return 0;
}
if (strstr(s2, "sync") == NULL) {
/* printf("55555\n"); */
return 0;
}
strcpy(sync_path, s2);
/* printf("Using sync in: %s\n", sync_path); */
return 1;
}
main(argc, argv)
int argc;
char **argv;
{
int pid = getpid();
int i, m = 0;
char *indexdir;
char s[MAX_LINE_LEN], s1[MAX_LINE_LEN];
char working_dir[MAX_LINE_LEN];
FILE *tmpfp;
char hash_file[MAX_LINE_LEN], string_file[MAX_LINE_LEN], freq_file[MAX_LINE_LEN];
char tmpbuf[1024];
struct stat stbuf;
char name[MAX_LINE_LEN];
char outname[MAX_LINE_LEN];
int specialwords, threshold;
int backup;
struct indices *get_removed_indices();
struct timeval tv;
#if ISO_CHAR_SET
setlocale(LC_ALL,""); /* support for 8bit character set: ew@senate.be, Henrik.Martin@eua.ericsson.se */
#endif
BuildDictionary = ON;
set_usemalloc();
srand(pid);
umask(077);
determine_sync();
INDEX_DIR[0] = '\0';
specialwords = threshold = -1; /* so that compute_dictionary can use defaults not visible here */
strncpy(IProgname, argv[0], MAX_LINE_LEN);
memset(size_list, '\0', sizeof(int *) * MAXNUM_INDIRECT); /* free it once partition successfully calculates p_size_list */
memset(name_list, '\0', sizeof(char **) * MAXNUM_INDIRECT);
memset(p_size_list, '\0', sizeof(int) * MAX_PARTITION);
build_filename_hashtable((char *)NULL, 0);
/*
* Process options.
*/
while (argc > 1) {
if (strcmp(argv[1], "-help") == 0) {
return usage(1);
}
#if !BUILDCAST
else if (strcmp(argv[1], "-V") == 0) {
printf("\nThis is glimpseindex version %s, %s.\n\n", GLIMPSE_VERSION, GLIMPSE_DATE);
return(0);
}
else if (strcmp(argv[1], "-T") == 0) {
BuildTurbo = ON;
argc --; argv ++;
}
else if (strcmp(argv[1], "-I") == 0) {
IndexableFile = ON;
argc --; argv ++;
}
else if(strcmp(argv[1], "-a") == 0) {
AddToIndex = ON;
argc--; argv++;
}
else if(strcmp(argv[1], "-b") == 0) {
ByteLevelIndex = ON;
argc--; argv++;
}
else if(strcmp(argv[1], "-c") == 0) {
CountWords = ON;
argc--; argv++;
}
else if(strcmp(argv[1], "-d") == 0) {
DeleteFromIndex = ON;
argc --; argv ++;
}
else if(strcmp(argv[1], "-D") == 0) {
PurgeIndex = OFF;
argc --; argv ++;
}
else if(strcmp(argv[1], "-f") == 0) {
FastIndex = ON;
argc--; argv++;
}
else if (strcmp(argv[1], "-o") == 0) {
OneFilePerBlock = ON;
argc --; argv ++;
}
else if (strcmp(argv[1], "-s") == 0) {
StructuredIndex = ON;
argc --; argv ++;
}
else if(strcmp(argv[1], "-z") == 0) {
UseFilters = ON;
argc--; argv++;
}
#else /*!BUILDCAST*/
else if (strcmp(argv[1], "-V") == 0) {
printf("\nThis is buildcast version %s, %s.\n\n", GLIMPSE_VERSION, GLIMPSE_DATE);
return(0);
}
else if(strcmp(argv[1], "-C") == 0) {
CompressAfterBuild = ON;
argc --; argv ++;
}
else if(strcmp(argv[1], "-E") == 0) {
BuildDictionaryExisting = ON;
argc --; argv ++;
}
else if (strcmp(argv[1], "-t") == 0) {
if ((argc <= 2) || !(isdigit(argv[2][0]))) {
return usage(1);
}
else {
threshold = atoi(argv[2]);
argc -= 2; argv += 2;
}
}
else if (strcmp(argv[1], "-l") == 0) {
if ((argc <= 2) || !(isdigit(argv[2][0]))) {
return usage(1);
}
else {
specialwords = atoi(argv[2]);
argc -= 2; argv += 2;
}
}
#endif /*!BUILDCAST*/
else if (strcmp(argv[1], "-M") == 0) {
if (argc == 2) {
fprintf(stderr, "-M should be followed by the amount of memory in MB for indexing words\n");
return usage(1);
}
m = atoi(argv[2]);
if (m < 1) {
fprintf(stderr, "Ignoring -M %d (< 1 MB). Using default value of about 2 MB\n", m);
return usage(1);
}
else {
/*
* Calculate I_THRESHOLD approximately. Note: 2*1024*1024*2 / (2*24 + 32 + 12) = 47662, DEF_I_THRESHOLD = 40000, so OK
* N * sizeofindices + N*(avgwordlen + sizeoftoken)/indicespertoken <= mem
* elemsperset = occurrences/indicespertoken
* N <= mem * occurrences / (sizeofindices*indicespertoken + avgwordlen + sizeoftoken)
*/
I_THRESHOLD = m * 1024 * 1024 * (INDICES_PER_TOKEN) /
(INDICES_PER_TOKEN * sizeof(struct indices) + sizeof(struct token) + AVG_WORD_LEN);
fprintf(stderr, "Using %d words as threshold before merge\n", I_THRESHOLD/INDICES_PER_TOKEN);
}
argc -= 2; argv += 2;
}
else if (strcmp(argv[1], "-w") == 0) {
if (argc == 2) {
fprintf(stderr, "-w should be followed by the number of words\n");
return usage(1);
}
MAXWORDSPERFILE = atoi(argv[2]);
argc -= 2; argv += 2;
}
else if (strcmp(argv[1], "-S") == 0) {
if (argc == 2) {
fprintf(stderr, "-S should be followed by the stop list limit\n");
return usage(1);
}
MAX_PER_MB = MAX_INDEX_PERCENT = atoi(argv[2]);
argc -= 2; argv += 2;
}
else if(strcmp(argv[1], "-n") == 0) {
IndexNumber = ON;
if ((argc <= 2) || !(isdigit(argv[2][0]))) { /* -n has no arg */
argc --; argv ++;
}
else {
NUMERICWORDPERCENT = atoi(argv[2]);
if ((NUMERICWORDPERCENT > 100) || (NUMERICWORDPERCENT < 0)) {
fprintf(stderr, "The percentage of numeric words must be in [0..100]\n");
return usage(1);
}
argc-=2; argv+=2;
}
}
else if(strcmp(argv[1], "-h") == 0) {
/* I want to generate .glimpse_filehash and .glimpse_filehash_index */
GenerateHash = ON;
argc --; argv ++;
}
else if(strcmp(argv[1], "-i") == 0) {
IncludeHigherPriority = ON;
argc --; argv ++;
}
else if(strcmp(argv[1], "-k") == 0) {
/* I want to know what files were there before: used in SFS to compute new sets from old ones */
KeepFilenames = ON;
argc --; argv ++;
}
else if (strcmp(argv[1], "-B") == 0) {
BigHashTable = 1;
argc --; argv ++;
}
else if (strcmp(argv[1], "-E") == 0) {
IndexEverything = 1; /* without doing stat tests, etc. */
argc --; argv ++;
}
else if(strcmp(argv[1], "-F") == 0) {
FilenamesOnStdin = ON;
argc--; argv++;
}
/*
else if(strcmp(argv[1], "-u") == 0) {
IndexUnderscore = ON;
argc--; argv++;
}
*/
else if (strcmp(argv[1], "-H") == 0) {
if (argc == 2) {
fprintf(stderr, "-H should be followed by a directory name\n");
return usage(1);
}
strncpy(INDEX_DIR, argv[2], MAX_LINE_LEN);
argc -= 2; argv += 2;
}
else break; /* rest are directory names */
}
BuildTurbo = ON; /* always ON: user can remove .glimpse_turbo if not needed */
/*
* Look for invalid option combos.
*/
if ((argc<=1) && (!FilenamesOnStdin) && !FastIndex) {
return usage(1);
}
if (DeleteFromIndex && (AddToIndex || CountWords || IndexableFile)) {
/* With -f, it is automatic for files not found in OS but present in index; without it, an explicit set of files is required as argument on cmdline */
fprintf(stderr, "-d cannot be used with -I, -a or -c (see man pages)\n");
exit(2);
}
if (ByteLevelIndex) {
if (MAX_PER_MB <= 0) {
fprintf(stderr, "Stop list limit (#of occurrences per MB) '%d' must be > 0\n", MAX_PER_MB);
exit(2);
}
}
else if (OneFilePerBlock) {
if ((MAX_INDEX_PERCENT <= 0) || (MAX_INDEX_PERCENT > 100)) {
fprintf(stderr, "Stop list limit (%% of occurrences in files) '%d' must be in (0, 100]\n", MAX_INDEX_PERCENT);
exit(2);
}
}
/*
* Find the index directory since it is used in all options.
*/
if (INDEX_DIR[0] == '\0') {
if ((indexdir = getenv("HOME")) == NULL) {
getcwd(INDEX_DIR, MAX_LINE_LEN-1);
fprintf(stderr, "Using working-directory '%s' to store index\n\n", INDEX_DIR);
}
else strncpy(INDEX_DIR, indexdir, MAX_LINE_LEN);
}
getcwd(working_dir, MAX_LINE_LEN - 1);
if (-1 == chdir(INDEX_DIR)) {
fprintf(stderr, "Cannot change directory to %s\n", INDEX_DIR);
return usage(0);
}
getcwd(INDEX_DIR, MAX_LINE_LEN - 1); /* must be absolute path name */
chdir(working_dir); /* get back to where you were */
if (IndexableFile) { /* traverse the given directories and output names of files that are indexable on stdout */
partition(argc, argv);
return 0;
}
else {
#if BUILDCAST
printf("\nThis is buildcast version %s, %s.\n\n", GLIMPSE_VERSION, GLIMPSE_DATE);
#else /*BUILDCAST*/
printf("\nThis is glimpseindex version %s, %s.\n\n", GLIMPSE_VERSION, GLIMPSE_DATE);
#endif /*BUILDCAST*/
}
if (ByteLevelIndex) {
#if 0
/* We'll worry about these things later */
if (AddToIndex || DeleteFromIndex || FastIndex) {
fprintf(stderr, "Fresh indexing recommended: -a, -d and -f are not supported with -b as yet\n");
exit(1);
}
AddToIndex = FastIndex = OFF;
#endif
CountWords = OFF;
OneFilePerBlock = ON;
}
/*
* CONVENTION: all the relevant output is on stdout; warnings/errors are on stderr.
* Initialize / open important files.
*/
read_filters(INDEX_DIR, UseFilters);
freq_file[0] = hash_file[0] = string_file[0] = '\0';
strcpy(freq_file, INDEX_DIR);
strcat(freq_file, "/");
strcat(freq_file, DEF_FREQ_FILE);
strcpy(hash_file, INDEX_DIR);
strcat(hash_file, "/");
strcat(hash_file, DEF_HASH_FILE);
strcpy(string_file, INDEX_DIR);
strcat(string_file, "/");
strcat(string_file, DEF_STRING_FILE);
initialize_tuncompress(string_file, freq_file, 0);
#if BG_DEBUG
sprintf(s, "%s/%s", INDEX_DIR, DEF_LOG_FILE);
if((LOGFILE = fopen(s, "w")) == 0) {
fprintf(stderr, "can't open %s for writing\n", s);
LOGFILE = stderr;
}
#endif /*BG_DEBUG*/
sprintf(s, "%s/%s", INDEX_DIR, DEF_MESSAGE_FILE);
if((MESSAGEFILE = fopen(s, "w")) == 0) {
fprintf(stderr, "can't open %s for writing\n", s);
MESSAGEFILE = stderr;
}
sprintf(s, "%s/%s", INDEX_DIR, DEF_STAT_FILE);
if((STATFILE = fopen(s, "a")) == 0) {
fprintf(stderr, "can't open %s for appending\n", s);
STATFILE = stderr;
}
gettimeofday(&tv, NULL);
#if BUILDCAST
fprintf(STATFILE, "\nThis is buildcast version %s, %s. %s", GLIMPSE_VERSION, GLIMPSE_DATE, ctime(&tv.tv_sec));
#else
fprintf(STATFILE, "\nThis is glimpseindex version %s, %s. %s", GLIMPSE_VERSION, GLIMPSE_DATE, ctime(&tv.tv_sec));
#endif
#if BG_DEBUG
fprintf(LOGFILE, "Index Directory = %s\n\n", INDEX_DIR);
#endif /*BG_DEBUG*/
if (MAXWORDSPERFILE != 0) fprintf(MESSAGEFILE, "Index: maximum number of indexed words per file = %d\n", MAXWORDSPERFILE);
else fprintf(MESSAGEFILE, "Index: maximum number of indexed words per file = infinity\n");
fprintf(MESSAGEFILE, "Index: maximum percentage of numeric words per file = %d\n", NUMERICWORDPERCENT);
set_indexable_char(indexable_char);
#if BUILDCAST
CountWords = ON;
AddToIndex = OFF;
FastIndex = OFF;
/* Save old search-dictionaries */
sprintf(s, "%s/.glimpse_index", INDEX_DIR);
if (!access(s, R_OK)) {
sprintf(s, "%s/.glimpse_tempdir.%d", INDEX_DIR, pid);
if (-1 == mkdir(s, 0700)) {
fprintf(stderr, "cannot create temporary directory %s\n", s);
return -1;
}
#if SFS_COMPAT
sprintf(s, "%s/%s", INDEX_DIR, INDEX_FILE);
sprintf(s1, "%s/.glimpse_tempdir.%d", INDEX_DIR, pid);
rename(s, s1);
#else
sprintf(s, "exec %s -f %s/%s %s/.glimpse_tempdir.%d\n", SYSTEM_MV, INDEX_DIR, INDEX_FILE, INDEX_DIR, pid);
system(s);
#endif
#if SFS_COMPAT
sprintf(s, "%s/%s", INDEX_DIR, P_TABLE);
sprintf(s1, "%s/.glimpse_tempdir.%d", INDEX_DIR, pid);
rename(s, s1);
#else
sprintf(s, "exec %s -f %s/%s %s/.glimpse_tempdir.%d\n", SYSTEM_MV, INDEX_DIR, P_TABLE, INDEX_DIR, pid);
system(s);
#endif
#if SFS_COMPAT
sprintf(s, "%s/%s", INDEX_DIR, NAME_LIST);
sprintf(s1, "%s/.glimpse_tempdir.%d", INDEX_DIR, pid);
rename(s, s1);
#else
sprintf(s, "exec %s -f %s/%s %s/.glimpse_tempdir.%d\n", SYSTEM_MV, INDEX_DIR, NAME_LIST, INDEX_DIR, pid);
system(s);
#endif
#if SFS_COMPAT
sprintf(s, "%s/%s", INDEX_DIR, NAME_LIST_INDEX);
sprintf(s1, "%s/.glimpse_tempdir.%d", INDEX_DIR, pid);
rename(s, s1);
#else
sprintf(s, "exec %s -f %s/%s %s/.glimpse_tempdir.%d\n", SYSTEM_MV, INDEX_DIR, NAME_LIST_INDEX, INDEX_DIR, pid);
system(s);
#endif
#if SFS_COMPAT
sprintf(s, "%s/%s", INDEX_DIR, NAME_HASH);
sprintf(s1, "%s/.glimpse_tempdir.%d", INDEX_DIR, pid);
rename(s, s1);
#else
sprintf(s, "exec %s -f %s/%s %s/.glimpse_tempdir.%d\n", SYSTEM_MV, INDEX_DIR, NAME_HASH, INDEX_DIR, pid);
system(s);
#endif
#if SFS_COMPAT
sprintf(s, "%s/%s", INDEX_DIR, NAME_HASH_INDEX);
sprintf(s1, "%s/.glimpse_tempdir.%d", INDEX_DIR, pid);
rename(s, s1);
#else
sprintf(s, "exec %s -f %s/%s %s/.glimpse_tempdir.%d\n", SYSTEM_MV, INDEX_DIR, NAME_HASH_INDEX, INDEX_DIR, pid);
system(s);
#endif
#if SFS_COMPAT
sprintf(s, "%s/%s", INDEX_DIR, MINI_FILE);
sprintf(s1, "%s/.glimpse_tempdir.%d", INDEX_DIR, pid);
rename(s, s1);
#else
sprintf(s, "exec %s -f %s/%s %s/.glimpse_tempdir.%d\n", SYSTEM_MV, INDEX_DIR, MINI_FILE, INDEX_DIR, pid);
system(s);
#endif
#if SFS_COMPAT
sprintf(s, "%s/%s", INDEX_DIR, DEF_STAT_FILE);
sprintf(s1, "%s/.glimpse_tempdir.%d", INDEX_DIR, pid);
rename(s, s1);
#else
sprintf(s, "exec %s -f %s/%s %s/.glimpse_tempdir.%d\n", SYSTEM_MV, INDEX_DIR, DEF_STAT_FILE, INDEX_DIR, pid);
system(s);
#endif
/* Don't save messages, log, debug, etc. */
sprintf(s, "%s/.glimpse_attributes", INDEX_DIR);
if (!access(s, R_OK)) {
#if SFS_COMPAT
sprintf(s, "%s/%s", INDEX_DIR, ATTRIBUTE_FILE);
sprintf(s1, "%s/.glimpse_tempdir.%d", INDEX_DIR, pid);
rename(s, s1);
#else
sprintf(s, "exec %s -f %s/%s %s/.glimpse_tempdir.%d\n", SYSTEM_MV, INDEX_DIR, ATTRIBUTE_FILE, INDEX_DIR, pid);
system(s);
#endif
}
}
/* Backup old cast-dictionaries: don't use move since indexing might want to use them */
sprintf(s, "%s/.glimpse_quick", INDEX_DIR);
if (!access(s, R_OK)) { /* there are previous cast dictionaries */
backup = rand();
sprintf(s, "%s/.glimpse_backup.%x", INDEX_DIR, backup);
if (-1 == mkdir(s, 0700)) {
fprintf(stderr, "cannot create backup directory %s\n", s);
return -1;
}
sprintf(s, "exec %s -f %s/.glimpse_quick %s/.glimpse_backup.%x\n", SYSTEM_CP, INDEX_DIR, INDEX_DIR, backup);
system(s);
sprintf(s, "exec %s -f %s/.glimpse_compress %s/.glimpse_backup.%x\n", SYSTEM_CP, INDEX_DIR, INDEX_DIR, backup);
system(s);
sprintf(s, "exec %s -f %s/.glimpse_compress.index %s/.glimpse_backup.%x\n", SYSTEM_CP, INDEX_DIR, INDEX_DIR, backup);
system(s);
sprintf(s, "exec %s -f %s/.glimpse_uncompress %s/.glimpse_backup.%x\n", SYSTEM_CP, INDEX_DIR, INDEX_DIR, backup);
system(s);
sprintf(s, "exec %s -f %s/.glimpse_uncompress.index %s/.glimpse_backup.%x\n", SYSTEM_CP, INDEX_DIR, INDEX_DIR, backup);
system(s);
printf("Saved previous cast-dictionary in %s/.glimpse_backup.%x\n", INDEX_DIR, backup);
}
/* Now index these files, and build new dictionaries */
partition(argc, argv);
initialize_data_structures(file_num);
old_file_num = file_num;
build_index();
cleanup();
save_data_structures();
destroy_filename_hashtable();
uninitialize_common();
uninitialize_tcompress();
uninitialize_tuncompress();
compute_dictionary(threshold, DISKBLOCKSIZE, specialwords, INDEX_DIR);
if (CompressAfterBuild) {
/* For the new compression */
if (!initialize_tcompress(hash_file, freq_file, TC_ERRORMSGS)) goto docleanup;
printf("Compressing files with new dictionary...\n");
/* Use the set of file-names collected during partition() / modified during build_hash */
for(i=0; i<file_num; i++) {
if ((disable_list != NULL) && (disable_list[block2index(i)] & mask_int[i%(8*sizeof(int))])) continue; /* nop since disable_list IS NULL */
strcpy(name, LIST_GET(name_list, i));
tcompress_file(name, outname, TC_REMOVE | TC_EASYSEARCH | TC_OVERWRITE | TC_NOPROMPT);
}
}
docleanup:
/* Restore old search-dictionaries */
sprintf(s, "%s/.glimpse_tempdir.%d/.glimpse_index", INDEX_DIR, pid);
if (!access(s, R_OK)) {
#if SFS_COMPAT
sprintf(s1, "%s/%s", INDEX_DIR, INDEX_FILE);
sprintf(s, "%s/.glimpse_tempdir.%d/%s", INDEX_DIR, pid, INDEX_FILE);
rename(s, s1);
sprintf(s1, "%s/%s", INDEX_DIR, P_TABLE);
sprintf(s, "%s/.glimpse_tempdir.%d/%s", INDEX_DIR, pid, P_TABLE);
rename(s, s1);
sprintf(s1, "%s/%s", INDEX_DIR, NAME_LIST);
sprintf(s, "%s/.glimpse_tempdir.%d/%s", INDEX_DIR, pid, NAME_LIST);
rename(s, s1);
sprintf(s1, "%s/%s", INDEX_DIR, NAME_LIST_INDEX);
sprintf(s, "%s/.glimpse_tempdir.%d/%s", INDEX_DIR, pid, NAME_LIST_INDEX);
rename(s, s1);
sprintf(s1, "%s/%s", INDEX_DIR, NAME_HASH);
sprintf(s, "%s/.glimpse_tempdir.%d/%s", INDEX_DIR, pid, NAME_HASH);
rename(s, s1);
sprintf(s1, "%s/%s", INDEX_DIR, NAME_HASH_INDEX);
sprintf(s, "%s/.glimpse_tempdir.%d/%s", INDEX_DIR, pid, NAME_HASH_INDEX);
rename(s, s1);
sprintf(s1, "%s/%s", INDEX_DIR, MINI_FILE);
sprintf(s, "%s/.glimpse_tempdir.%d/%s", INDEX_DIR, pid, MINI_FILE);
rename(s, s1);
sprintf(s1, "%s/%s", INDEX_DIR, DEF_STAT_FILE);
sprintf(s, "%s/.glimpse_tempdir.%d/%s", INDEX_DIR, pid, DEF_STAT_FILE);
rename(s, s1);
sprintf(s1, "%s/%s", INDEX_DIR, ATTRIBUTE_FILE);
sprintf(s, "%s/.glimpse_tempdir.%d/%s", INDEX_DIR, pid, ATTRIBUTE_FILE);
rename(s, s1);
#else
sprintf(s, "exec %s -f %s/.glimpse_tempdir.%d/.glimpse_* %s\n", SYSTEM_MV, INDEX_DIR, pid, INDEX_DIR);
system(s);
#endif
sprintf(s, "%s/.glimpse_tempdir.%d", INDEX_DIR, pid);
rmdir(s);
}
printf("\nBuilt new cast-dictionary in %s\n", INDEX_DIR);
#else /*BUILDCAST*/
if (AddToIndex || DeleteFromIndex || FastIndex) {
/* Not handling byte level indices here for now */
int indextype;
sprintf(s, "%s/%s", INDEX_DIR, INDEX_FILE);
if (-1 == stat(s, &istbuf)) {
if (AddToIndex || DeleteFromIndex) {
fprintf(stderr, "Cannot find previous index! Fresh indexing recommended\n", s);
return usage(0);
}
file_num = 0;
file_id = 0;
part_num = 1;
goto fresh_indexing;
}
/* Find out existing index of words and partitions/filenumbers */
if ((indextype = get_index_type(s)) < 0) {
#if 0
fprintf(stderr, "Fresh indexing recommended: -a and -f are not supported with -b as yet\n");
exit(1);
/* we support it now */
#endif
}
file_num = part_num = 0;
sprintf(s, "%s/%s", INDEX_DIR, NAME_LIST);
file_num = get_array_of_lines(s, name_list, MaxNum24bPartition, 1);
initialize_disable_list(file_num);
initialize_data_structures(file_num);
if (!indextype) {
sprintf(s, "%s/%s", INDEX_DIR, P_TABLE);
part_num = get_table(s, p_table, MAX_PARTITION, 1) - 1; /* part_num INCLUDES last partition */
}
else merge_splits();
/* Check for errors, Set OneFilePerBlock */
if ( (file_num <= 0) || (!indextype && (part_num <= 0)) ) {
if (AddToIndex || DeleteFromIndex) {
fprintf(stderr, "Cannot find previous glimpseindex files! Fresh indexing recommended\n");
return usage(0);
}
file_num = 0;
file_id = 0;
part_num = 1;
my_free(disable_list);
disable_list = NULL;
goto fresh_indexing;
}
if (OneFilePerBlock && !indextype) {
fprintf(stderr, "Warning: ignoring option -o: using format of existing index\n");
}
OneFilePerBlock = abs(indextype);
if (indextype < 0) ByteLevelIndex = ON;
/* Used in FastIndex for all existing files, used in AddToIndex/DeleteFromIndex if we are trying to add/remove an existing file */
build_filename_hashtable(name_list, file_num);
#if 0
/* Test if these are inverses of each other */
save_data_structures();
merge_splits();
#endif /*0*/
/*
* FastIndex: set disable-flag for unchanged files: remove AND
* disable non-existent files. Let hole remain in file-names/partitions.
*/
if (FastIndex) {
for (i=0; i<file_num; i++)
if (-1 == stat(LIST_GET(name_list, i), &stbuf)) {
remove_filename(i, -1);
}
else if (((stbuf.st_mode & S_IFMT) == S_IFREG) && (stbuf.st_ctime <= istbuf.st_ctime)) {
/* This is just used as a cache since exclude/include processing is not done here: see dir.c */
disable_list[block2index(i)] |= mask_int[i % (8*sizeof(int))];
}
else {
/* Can't do it for directories since files in it can be modified w/o date reflected in the directory. Same for symlinks. */
LIST_ADD(size_list, i, stbuf.st_size, int);
disable_list[block2index(i)] &= ~(mask_int[i % (8*sizeof(int))]);
}
}
/*
* AddToIndex without FastIndex: disable all existing files, remove those that don't exist now.
* Out of old ones, only ADDED FILES are re-enabled: dir.c
*/
else if (AddToIndex) {
for (i=0; i<file_num; i++) {
if (-1 == stat(LIST_GET(name_list, i), &stbuf)) {
remove_filename(i, -1);
}
else {
LIST_ADD(size_list, i, stbuf.st_size, int); /* ONLY for proper statistics in save_data_structures() */
disable_list[block2index(i)] |= mask_int[i % (8*sizeof(int))];
}
}
}
/* else: DeleteFromIndex without FastIndex: don't touch other files */
old_file_num = file_num;
destroy_data_structures();
/* Put old/new files into partitions/filenumbers */
if (-1 == oldpartition(argc, argv)) {
for(i=0;i<file_num;i++) {
#if BG_DEBUG
memory_usage -= (strlen(LIST_GET(name_list, i)) + 2);
#endif /*BG_DEBUG*/
if (LIST_GET(name_list, i) != NULL) {
my_free(LIST_GET(name_list, i), 0);
LIST_SUREGET(name_list, i) = NULL;
}
}
file_num = 0;
file_id = 0;
for (i=0;i<part_num; i++) {
p_table[i] = 0;
}
part_num = 1;
my_free(disable_list);
disable_list = NULL;
goto fresh_indexing;
}
/* Reindex all the files but use the file-names obtained with oldpartition() */
if (cross_boundary(OneFilePerBlock, file_num)) {
my_free(disable_list);
disable_list = NULL;
}
initialize_data_structures(file_num);
if (!DeleteFromIndex || FastIndex) build_index();
if ((deletedlist = get_removed_indices()) == NULL) new_file_num = file_num;
else if (PurgeIndex) new_file_num = purge_index();
#if BG_DEBUG
fprintf(LOGFILE, "Built indices in %s/%s\n", INDEX_DIR, INDEX_FILE);
#endif /*BG_DEBUG*/
goto docleanup;
}
fresh_indexing:
/* remove it to create space since it can be large: don't need for fresh indexing */
sprintf(s, "%s/%s", INDEX_DIR, P_TABLE);
unlink(s);
/* These should be zeroed since they can confuse fsize and fsize_directory() */
AddToIndex = 0;
FastIndex = 0;
#if BG_DEBUG
fprintf(LOGFILE, "Commencing fresh indexing\n");
#endif /*BG_DEBUG*/
partition(argc, argv);
destroy_filename_hashtable();
initialize_data_structures(file_num);
old_file_num = file_num;
build_index();
#if BG_DEBUG
fprintf(LOGFILE, "\nBuilt indices in %s/%s\n", INDEX_DIR, INDEX_FILE);
#endif /*BG_DEBUG*/
docleanup:
cleanup();
save_data_structures();
destroy_filename_hashtable();
#if BG_DEBUG
fflush(LOGFILE);
fclose(LOGFILE);
#endif /*BG_DEBUG*/
fflush(MESSAGEFILE);
fclose(MESSAGEFILE);
fflush(STATFILE);
fclose(STATFILE);
if (AddedMaxWordsMessage) printf("\nSome files contributed > %d words to the index: check %s\n", MAXWORDSPERFILE, DEF_MESSAGE_FILE);
if (AddedMixedWordsMessage) printf("Some files had numerals in > %d%% of the indexed words: check %s\n", NUMERICWORDPERCENT, DEF_MESSAGE_FILE);
printf("\nIndex-directory: \"%s\"\nGlimpse-files created here:\n", INDEX_DIR);
chdir(INDEX_DIR);
sprintf(s, "exec %s -lg .glimpse_* > /tmp/%d\n", SYSTEM_LS, pid);
system(s);
sprintf(s, "/tmp/%d", pid);
if ((tmpfp = fopen(s, "r")) != NULL) {
memset(tmpbuf, '\0', 1024);
while(fgets(tmpbuf, 1024, tmpfp) != NULL) fputs(tmpbuf, stdout);
fflush(tmpfp);
fclose(tmpfp);
unlink(s);
}
else fprintf(stderr, "cannot open %s to `cat': check %s for .glimpse - files\n", s, INDEX_DIR);
#endif /*BUILDCAST*/
return 0;
}
cleanup()
{
char s[MAX_LINE_LEN];
sprintf(s, "%s/%s", INDEX_DIR, I1);
unlink(s);
sprintf(s, "%s/%s", INDEX_DIR, I2);
unlink(s);
sprintf(s, "%s/%s", INDEX_DIR, I3);
unlink(s);
sprintf(s, "%s/%s", INDEX_DIR, O1);
unlink(s);
sprintf(s, "%s/%s", INDEX_DIR, O2);
unlink(s);
sprintf(s, "%s/%s", INDEX_DIR, O3);
unlink(s);
sprintf(s, "%s/.glimpse_apply.%d", INDEX_DIR, getpid());
unlink(s);
}
#if !BUILDCAST
usage(flag)
int flag;
{
if (flag) fprintf(stderr, "\nThis is glimpseindex version %s, %s.\n\n", GLIMPSE_VERSION, GLIMPSE_DATE);
fprintf(stderr, "usage: %s [-help] [-a] [-d] [-f] [-i] [-n [#]] [-o] [-s] [-w #] [-B] [-F] [-H dir] [-I] [-M] [-S lim] [-T] [-V] dirs/files\n", IProgname);
fprintf(stderr, "summary of frequently used options\n(for a more detailed listing see 'man glimpse'):\n");
fprintf(stderr, "-help: outputs this menu\n");
fprintf(stderr, "-a: add given files/dirs to an existing index\n");
fprintf(stderr, "-d: delete given files/dirs from an existing index\n");
fprintf(stderr, "-b: build a (large) byte level index to speed up search\n");
fprintf(stderr, "-f: use modification dates to do fast indexing\n");
fprintf(stderr, "-n #: index numbers; warn if file adds > #%% numeric words: default is 50\n");
fprintf(stderr, "-o: optimize for speed by building a larger index\n");
/* fprintf(stderr, "-s: build the index for structured queries (a1=v1 &/| a2=v2...)\n"); this should not be advertised */
fprintf(stderr, "-w #: warn if a file adds > # words to the index\n");
fprintf(stderr, "-F: expect filenames on stdin (useful for pipelining)\n");
fprintf(stderr, "-H 'dir': .glimpse-files should be in directory 'dir': default is '~'\n");
fprintf(stderr, "-T: build .glimpse_turbo for very fast search with -i -w in glimpse\n");
fprintf(stderr, "\n");
fprintf(stderr, "For questions about glimpse, please contact `%s'\n", GLIMPSE_EMAIL);
exit(1);
}
#else /*!BUILDCAST*/
usage(flag)
int flag;
{
if (flag) fprintf(stderr, "\nThis is buildcast version %s, %s.\n\n", GLIMPSE_VERSION, GLIMPSE_DATE);
fprintf(stderr, "usage: %s [-help] [-t] [-i] [-l] [-n [#]] [-w #] [-C] [-E] [-F] [-H dir] [-V] dirs/files\n", IProgname);
fprintf(stderr, "summary of frequently used options\n(for a more detailed listing see 'man cast'):\n");
fprintf(stderr, "-help: output this menu\n");
fprintf(stderr, "-n #: index numbers; warn if file adds > #%% numeric words: default is 50\n");
fprintf(stderr, "-w #: warn if a file adds > # words to the index\n");
fprintf(stderr, "-C: compress files with the new dictionary after building it\n");
fprintf(stderr, "-E: build cast dictionary using existing compressed files only\n");
fprintf(stderr, "-F: expect filenames on stdin (useful for pipelining)\n");
fprintf(stderr, "-H 'dir': .glimpse-files should be in directory 'dir': default is '~'\n");
fprintf(stderr, "\n");
fprintf(stderr, "For questions about glimpse, please contact `%s'\n", GLIMPSE_EMAIL);
exit(1);
}
#endif /*!BUILDCAST*/